!--------------------------------------------------------------------------------------------------!
!   CP2K: A general program to perform molecular dynamics simulations                              !
!   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
!                                                                                                  !
!   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
!--------------------------------------------------------------------------------------------------!

! **************************************************************************************************
!> \brief Calls routines to get RI integrals and calculate total energies
!> \par History
!>      10.2011 created [Joost VandeVondele and Mauro Del Ben]
!>      07.2019 split from mp2_gpw.F [Frederick Stein]
! **************************************************************************************************
MODULE mp2_gpw
   USE atomic_kind_types,               ONLY: atomic_kind_type
   USE basis_set_types,                 ONLY: get_gto_basis_set,&
                                              gto_basis_set_p_type,&
                                              gto_basis_set_type
   USE cell_types,                      ONLY: cell_type,&
                                              get_cell
   USE cp_blacs_env,                    ONLY: BLACS_GRID_SQUARE,&
                                              cp_blacs_env_create,&
                                              cp_blacs_env_release,&
                                              cp_blacs_env_type
   USE cp_control_types,                ONLY: dft_control_type
   USE cp_dbcsr_api,                    ONLY: &
        dbcsr_clear_mempools, dbcsr_copy, dbcsr_create, dbcsr_distribution_release, &
        dbcsr_distribution_type, dbcsr_filter, dbcsr_init_p, dbcsr_iterator_blocks_left, &
        dbcsr_iterator_next_block, dbcsr_iterator_start, dbcsr_iterator_stop, dbcsr_iterator_type, &
        dbcsr_p_type, dbcsr_release, dbcsr_type, dbcsr_type_no_symmetry, dbcsr_type_symmetric
   USE cp_dbcsr_contrib,                ONLY: dbcsr_reserve_all_blocks
   USE cp_dbcsr_cp2k_link,              ONLY: cp_dbcsr_alloc_block_from_nbl
   USE cp_dbcsr_operations,             ONLY: cp_dbcsr_dist2d_to_dist,&
                                              cp_dbcsr_m_by_n_from_row_template
   USE cp_fm_types,                     ONLY: cp_fm_get_info,&
                                              cp_fm_release,&
                                              cp_fm_type
   USE cp_log_handling,                 ONLY: &
        cp_add_default_logger, cp_get_default_logger, cp_logger_create, &
        cp_logger_get_default_unit_nr, cp_logger_release, cp_logger_set, cp_logger_type, &
        cp_rm_default_logger, cp_to_string
   USE dbt_api,                         ONLY: dbt_type
   USE distribution_1d_types,           ONLY: distribution_1d_release,&
                                              distribution_1d_type
   USE distribution_2d_types,           ONLY: distribution_2d_release,&
                                              distribution_2d_type
   USE distribution_methods,            ONLY: distribute_molecules_1d,&
                                              distribute_molecules_2d
   USE group_dist_types,                ONLY: create_group_dist,&
                                              get_group_dist,&
                                              group_dist_d1_type,&
                                              release_group_dist
   USE hfx_types,                       ONLY: block_ind_type,&
                                              hfx_compression_type
   USE input_constants,                 ONLY: &
        do_eri_gpw, do_eri_os, do_potential_coulomb, do_potential_id, do_potential_truncated, &
        eri_default, mp2_method_gpw, ri_default, ri_mp2_method_gpw, rpa_exchange_none
   USE input_section_types,             ONLY: section_vals_val_get
   USE kinds,                           ONLY: dp
   USE kpoint_types,                    ONLY: kpoint_type
   USE libint_wrapper,                  ONLY: cp_libint_static_cleanup,&
                                              cp_libint_static_init
   USE machine,                         ONLY: default_output_unit,&
                                              m_flush
   USE message_passing,                 ONLY: mp_para_env_release,&
                                              mp_para_env_type
   USE molecule_kind_types,             ONLY: molecule_kind_type
   USE molecule_types,                  ONLY: molecule_type
   USE mp2_cphf,                        ONLY: solve_z_vector_eq
   USE mp2_gpw_method,                  ONLY: mp2_gpw_compute
   USE mp2_integrals,                   ONLY: mp2_ri_gpw_compute_in
   USE mp2_ri_gpw,                      ONLY: mp2_ri_gpw_compute_en
   USE mp2_ri_grad,                     ONLY: calc_ri_mp2_nonsep
   USE mp2_types,                       ONLY: mp2_type,&
                                              three_dim_real_array
   USE particle_methods,                ONLY: get_particle_set
   USE particle_types,                  ONLY: particle_type
   USE qs_environment_types,            ONLY: get_qs_env,&
                                              qs_environment_type
   USE qs_integral_utils,               ONLY: basis_set_list_setup
   USE qs_interactions,                 ONLY: init_interaction_radii
   USE qs_kind_types,                   ONLY: get_qs_kind,&
                                              qs_kind_type
   USE qs_ks_types,                     ONLY: qs_ks_env_type
   USE qs_mo_types,                     ONLY: get_mo_set,&
                                              mo_set_type
   USE qs_neighbor_list_types,          ONLY: neighbor_list_set_p_type,&
                                              release_neighbor_list_sets
   USE qs_neighbor_lists,               ONLY: atom2d_build,&
                                              atom2d_cleanup,&
                                              build_neighbor_lists,&
                                              local_atoms_type,&
                                              pair_radius_setup
   USE rpa_main,                        ONLY: rpa_ri_compute_en
   USE rpa_rse,                         ONLY: rse_energy
#include "./base/base_uses.f90"

   IMPLICIT NONE

   PRIVATE

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'mp2_gpw'

   PUBLIC :: mp2_gpw_main, create_mat_munu, grep_rows_in_subgroups, build_dbcsr_from_rows

CONTAINS

! **************************************************************************************************
!> \brief with a big bang to mp2
!> \param qs_env ...
!> \param mp2_env ...
!> \param Emp2 ...
!> \param Emp2_Cou ...
!> \param Emp2_EX ...
!> \param Emp2_S ...
!> \param Emp2_T ...
!> \param mos_mp2 ...
!> \param para_env ...
!> \param unit_nr ...
!> \param calc_forces ...
!> \param calc_ex ...
!> \param do_ri_mp2 ...
!> \param do_ri_rpa ...
!> \param do_ri_sos_laplace_mp2 ...
!> \author Mauro Del Ben and Joost VandeVondele
! **************************************************************************************************
   SUBROUTINE mp2_gpw_main(qs_env, mp2_env, Emp2, Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T, &
                           mos_mp2, para_env, unit_nr, calc_forces, calc_ex, do_ri_mp2, do_ri_rpa, &
                           do_ri_sos_laplace_mp2)
      TYPE(qs_environment_type), POINTER                 :: qs_env
      TYPE(mp2_type)                                     :: mp2_env
      REAL(KIND=dp), INTENT(OUT)                         :: Emp2, Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T
      TYPE(mo_set_type), DIMENSION(:), INTENT(IN)        :: mos_mp2
      TYPE(mp_para_env_type), POINTER                    :: para_env
      INTEGER, INTENT(IN)                                :: unit_nr
      LOGICAL, INTENT(IN)                                :: calc_forces, calc_ex
      LOGICAL, INTENT(IN), OPTIONAL                      :: do_ri_mp2, do_ri_rpa, &
                                                            do_ri_sos_laplace_mp2

      CHARACTER(LEN=*), PARAMETER                        :: routineN = 'mp2_gpw_main'

      INTEGER :: blacs_grid_layout, bse_lev_virt, color_sub, dimen_RI, dimen_RI_red, eri_method, &
         handle, ispin, local_unit_nr, my_group_L_end, my_group_L_size, my_group_L_start, nmo, &
         nspins, potential_type, ri_metric_type
      INTEGER, ALLOCATABLE, DIMENSION(:) :: ends_array_mc, ends_array_mc_block, gw_corr_lev_occ, &
         gw_corr_lev_virt, homo, starts_array_mc, starts_array_mc_block
      INTEGER, DIMENSION(3)                              :: periodic
      LOGICAL :: blacs_repeatable, do_bse, do_im_time, do_kpoints_cubic_RPA, my_do_gw, &
         my_do_ri_mp2, my_do_ri_rpa, my_do_ri_sos_laplace_mp2
      REAL(KIND=dp)                                      :: Emp2_AB, Emp2_BB, Emp2_Cou_BB, &
                                                            Emp2_EX_BB, eps_gvg_rspace_old, &
                                                            eps_pgf_orb_old, eps_rho_rspace_old
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: Eigenval
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: BIb_C_bse_ab, BIb_C_bse_ij
      REAL(KIND=dp), DIMENSION(:), POINTER               :: mo_eigenvalues
      TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
      TYPE(block_ind_type), ALLOCATABLE, &
         DIMENSION(:, :, :)                              :: t_3c_O_ind
      TYPE(cell_type), POINTER                           :: cell
      TYPE(cp_blacs_env_type), POINTER                   :: blacs_env_sub, blacs_env_sub_mat_munu
      TYPE(cp_fm_type)                                   :: fm_matrix_PQ
      TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: mo_coeff
      TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :)     :: fm_matrix_L_kpoints, fm_matrix_Minv, &
                                                            fm_matrix_Minv_L_kpoints, &
                                                            fm_matrix_Minv_Vtrunc_Minv
      TYPE(cp_fm_type), POINTER                          :: mo_coeff_ptr
      TYPE(cp_logger_type), POINTER                      :: logger, logger_sub
      TYPE(dbcsr_p_type)                                 :: mat_munu, mat_P_global
      TYPE(dbcsr_p_type), ALLOCATABLE, DIMENSION(:)      :: mo_coeff_all, mo_coeff_gw, mo_coeff_o, &
                                                            mo_coeff_o_bse, mo_coeff_v, &
                                                            mo_coeff_v_bse
      TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: matrix_s
      TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER       :: matrix_s_kp
      TYPE(dbt_type)                                     :: t_3c_M
      TYPE(dbt_type), ALLOCATABLE, DIMENSION(:, :)       :: t_3c_O
      TYPE(dft_control_type), POINTER                    :: dft_control
      TYPE(group_dist_d1_type)                           :: gd_array, gd_B_all, gd_B_occ_bse, &
                                                            gd_B_virt_bse
      TYPE(group_dist_d1_type), ALLOCATABLE, &
         DIMENSION(:)                                    :: gd_B_virtual
      TYPE(hfx_compression_type), ALLOCATABLE, &
         DIMENSION(:, :, :)                              :: t_3c_O_compressed
      TYPE(kpoint_type), POINTER                         :: kpoints, kpoints_from_DFT
      TYPE(mo_set_type), DIMENSION(:), POINTER           :: mos
      TYPE(mp_para_env_type), POINTER                    :: para_env_sub
      TYPE(neighbor_list_set_p_type), DIMENSION(:), &
         POINTER                                         :: sab_orb_sub
      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
      TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
      TYPE(qs_ks_env_type), POINTER                      :: ks_env
      TYPE(three_dim_real_array), ALLOCATABLE, &
         DIMENSION(:)                                    :: BIb_C, BIb_C_gw

      CALL timeset(routineN, handle)

      ! check if we want to do ri-mp2
      my_do_ri_mp2 = .FALSE.
      IF (PRESENT(do_ri_mp2)) my_do_ri_mp2 = do_ri_mp2

      ! check if we want to do ri-rpa
      my_do_ri_rpa = .FALSE.
      IF (PRESENT(do_ri_rpa)) my_do_ri_rpa = do_ri_rpa

      ! check if we want to do ri-sos-laplace-mp2
      my_do_ri_sos_laplace_mp2 = .FALSE.
      IF (PRESENT(do_ri_sos_laplace_mp2)) my_do_ri_sos_laplace_mp2 = do_ri_sos_laplace_mp2

      ! GW and SOS-MP2 cannot be used together
      IF (my_do_ri_sos_laplace_mp2) THEN
         CPASSERT(.NOT. mp2_env%ri_rpa%do_ri_g0w0)
      END IF

      ! check if we want to do imaginary time
      do_im_time = mp2_env%do_im_time
      do_bse = qs_env%mp2_env%bse%do_bse
      do_kpoints_cubic_RPA = qs_env%mp2_env%ri_rpa_im_time%do_im_time_kpoints

      IF (do_kpoints_cubic_RPA .AND. mp2_env%ri_rpa%do_ri_g0w0) THEN
         CPABORT("Full RPA k-points (DO_KPOINTS in LOW_SCALING section) not implemented with GW")
      END IF

      ! Get the number of spins
      nspins = SIZE(mos_mp2)

      ! ... setup needed to be able to qs_integrate in a subgroup.
      IF (do_kpoints_cubic_RPA) THEN
         CALL get_qs_env(qs_env=qs_env, dft_control=dft_control, kpoints=kpoints_from_DFT)
         mos(1:nspins) => kpoints_from_DFT%kp_env(1)%kpoint_env%mos(1:nspins, 1)
      ELSE
         CALL get_qs_env(qs_env=qs_env, dft_control=dft_control, mos=mos)
      END IF
      CALL get_mo_set(mo_set=mos_mp2(1), nmo=nmo)
      ALLOCATE (homo(nspins), Eigenval(nmo, nspins), mo_coeff(nspins))
      DO ispin = 1, nspins
         CALL get_mo_set(mo_set=mos_mp2(ispin), &
                         eigenvalues=mo_eigenvalues, homo=homo(ispin), &
                         mo_coeff=mo_coeff_ptr)
         mo_coeff(ispin) = mo_coeff_ptr
         Eigenval(:, ispin) = mo_eigenvalues(1:nmo)
      END DO

      ! a para_env
      color_sub = para_env%mepos/mp2_env%mp2_num_proc
      ALLOCATE (para_env_sub)
      CALL para_env_sub%from_split(para_env, color_sub)

      ! each of the sub groups might need to generate output
      logger => cp_get_default_logger()
      IF (para_env%is_source()) THEN
         local_unit_nr = cp_logger_get_default_unit_nr(logger, local=.FALSE.)
      ELSE
         local_unit_nr = default_output_unit
      END IF

      ! get stuff
      CALL get_qs_env(qs_env, &
                      ks_env=ks_env, &
                      qs_kind_set=qs_kind_set, &
                      cell=cell, &
                      particle_set=particle_set, &
                      atomic_kind_set=atomic_kind_set, &
                      dft_control=dft_control, &
                      matrix_s_kp=matrix_s_kp)

      CALL get_cell(cell=cell, periodic=periodic)

      IF (do_im_time) THEN
         IF (mp2_env%ri_metric%potential_type == ri_default) THEN
            IF (SUM(periodic) == 1 .OR. SUM(periodic) == 3) THEN
               mp2_env%ri_metric%potential_type = do_potential_id
            ELSE
               mp2_env%ri_metric%potential_type = do_potential_truncated
            END IF
         END IF

         ! statically initialize libint
         CALL cp_libint_static_init()

      END IF

      IF (mp2_env%ri_metric%potential_type == ri_default) THEN
         mp2_env%ri_metric%potential_type = do_potential_coulomb
      END IF

      IF (mp2_env%eri_method == eri_default) THEN
         IF (SUM(periodic) > 0) mp2_env%eri_method = do_eri_gpw
         IF (SUM(periodic) == 0) mp2_env%eri_method = do_eri_os
         IF (SUM(mp2_env%ri_rpa_im_time%kp_grid) > 0) mp2_env%eri_method = do_eri_os
         IF (mp2_env%method == mp2_method_gpw) mp2_env%eri_method = do_eri_gpw
         IF (mp2_env%method == ri_mp2_method_gpw) mp2_env%eri_method = do_eri_gpw
         IF (mp2_env%ri_rpa_im_time%do_im_time_kpoints) mp2_env%eri_method = do_eri_os
         IF (calc_forces .AND. mp2_env%eri_method == do_eri_os) mp2_env%eri_method = do_eri_gpw
      END IF
      eri_method = mp2_env%eri_method

      IF (unit_nr > 0 .AND. mp2_env%eri_method == do_eri_gpw) THEN
         WRITE (UNIT=unit_nr, FMT="(T3,A,T71,F10.1)") &
            "GPW_INFO| Density cutoff [a.u.]:", mp2_env%mp2_gpw%cutoff*0.5_dp
         WRITE (UNIT=unit_nr, FMT="(T3,A,T71,F10.1)") &
            "GPW_INFO| Relative density cutoff [a.u.]:", mp2_env%mp2_gpw%relative_cutoff*0.5_dp
         CALL m_flush(unit_nr)
      END IF

      ! MG: Disable logger layer for BSE, misses some key information to print cube files properly
      IF (.NOT. (mp2_env%ri_g0w0%print_local_bandgap .OR. mp2_env%bse%do_nto_analysis)) THEN
         ! a logger
         NULLIFY (logger_sub)
         CALL cp_logger_create(logger_sub, para_env=para_env_sub, &
                               default_global_unit_nr=local_unit_nr, &
                               close_global_unit_on_dealloc=.FALSE.)
         CALL cp_logger_set(logger_sub, local_filename="MP2_localLog")
         ! set to a custom print level (we could also have a different print level for para_env%source)
         logger_sub%iter_info%print_level = mp2_env%mp2_gpw%print_level
         CALL cp_add_default_logger(logger_sub)
      END IF

      ! a blacs_env (ignore the globenv stored defaults for now)
      blacs_grid_layout = BLACS_GRID_SQUARE
      blacs_repeatable = .TRUE.
      NULLIFY (blacs_env_sub)
      CALL cp_blacs_env_create(blacs_env_sub, para_env_sub, &
                               blacs_grid_layout, &
                               blacs_repeatable)

      blacs_env_sub_mat_munu => blacs_env_sub

      matrix_s(1:1) => matrix_s_kp(1:1, 1)

      CALL get_eps_old(dft_control, eps_pgf_orb_old, eps_rho_rspace_old, eps_gvg_rspace_old)

      CALL create_mat_munu(mat_munu, qs_env, mp2_env%mp2_gpw%eps_grid, &
                           blacs_env_sub_mat_munu, do_alloc_blocks_from_nbl=.NOT. do_im_time, sab_orb_sub=sab_orb_sub, &
                           do_kpoints=mp2_env%ri_rpa_im_time%do_im_time_kpoints, &
                           dbcsr_sym_type=dbcsr_type_symmetric)

      ! which RI metric we want to have
      ri_metric_type = mp2_env%ri_metric%potential_type

      ! which interaction potential
      potential_type = mp2_env%potential_parameter%potential_type

      ! check if we want to do ri-g0w0 on top of ri-rpa
      my_do_gw = mp2_env%ri_rpa%do_ri_g0w0
      ALLOCATE (gw_corr_lev_occ(nspins), gw_corr_lev_virt(nspins))
      gw_corr_lev_occ(1) = mp2_env%ri_g0w0%corr_mos_occ
      gw_corr_lev_virt(1) = mp2_env%ri_g0w0%corr_mos_virt
      IF (nspins == 2) THEN
         gw_corr_lev_occ(2) = mp2_env%ri_g0w0%corr_mos_occ_beta
         gw_corr_lev_virt(2) = mp2_env%ri_g0w0%corr_mos_virt_beta
      END IF

      IF (do_bse) THEN
         IF (nspins > 1) THEN
            CPABORT("BSE not implemented for open shell calculations")
         END IF
         !Keep default behavior for occupied
         ! We do not implement an explicit bse_lev_occ here, because the small number of occupied levels
         ! does not critically influence the memory
         bse_lev_virt = gw_corr_lev_virt(1)
      END IF

      ! After the components are inside of the routines, we can move this line insight the branch
      ALLOCATE (mo_coeff_o(nspins), mo_coeff_v(nspins), mo_coeff_all(nspins), mo_coeff_gw(nspins))

      ! Always allocate for usage in call of replicate_mat_to_subgroup
      ALLOCATE (mo_coeff_o_bse(1), mo_coeff_v_bse(1))

      ! for imag. time, we do not need this
      IF (.NOT. do_im_time) THEN

         ! new routine: replicate a full matrix from one para_env to a smaller one
         ! keeping the memory usage as small as possible in this case the
         ! output the two part of the C matrix (virtual, occupied)
         DO ispin = 1, nspins

            CALL replicate_mat_to_subgroup(para_env, para_env_sub, mo_coeff(ispin), homo(ispin), mat_munu%matrix, &
                                           mo_coeff_o(ispin)%matrix, mo_coeff_v(ispin)%matrix, &
                                           mo_coeff_all(ispin)%matrix, mo_coeff_gw(ispin)%matrix, &
                                           my_do_gw, gw_corr_lev_occ(ispin), gw_corr_lev_virt(ispin), do_bse, &
                                           bse_lev_virt, mo_coeff_o_bse(1)%matrix, mo_coeff_v_bse(1)%matrix, &
                                           mp2_env%mp2_gpw%eps_filter)

         END DO

      END IF

      ! now we're kind of ready to go....
      Emp2_S = 0.0_dp
      Emp2_T = 0.0_dp
      IF (my_do_ri_mp2 .OR. my_do_ri_rpa .OR. my_do_ri_sos_laplace_mp2) THEN
         ! RI-GPW integrals (same stuff for both RPA and MP2)
         IF (nspins == 2) THEN
            ! open shell case (RI) here the (ia|K) integrals are computed for both the alpha and beta components
            CALL mp2_ri_gpw_compute_in( &
               BIb_C, BIb_C_gw, BIb_C_bse_ij, BIb_C_bse_ab, gd_array, gd_B_virtual, dimen_RI, dimen_RI_red, qs_env, &
               para_env, para_env_sub, color_sub, cell, particle_set, &
               atomic_kind_set, qs_kind_set, fm_matrix_PQ, fm_matrix_L_kpoints, fm_matrix_Minv_L_kpoints, &
               fm_matrix_Minv, fm_matrix_Minv_Vtrunc_Minv, nmo, homo, mat_munu, sab_orb_sub, &
               mo_coeff_o, mo_coeff_v, mo_coeff_all, mo_coeff_gw, mo_coeff_o_bse, mo_coeff_v_bse, &
               mp2_env%mp2_gpw%eps_filter, unit_nr, &
               mp2_env%mp2_memory, mp2_env%calc_PQ_cond_num, calc_forces, blacs_env_sub, my_do_gw .AND. .NOT. do_im_time, &
               do_bse, gd_B_all, starts_array_mc, ends_array_mc, starts_array_mc_block, ends_array_mc_block, &
               gw_corr_lev_occ(1), gw_corr_lev_virt(1), &
               bse_lev_virt, &
               do_im_time, do_kpoints_cubic_RPA, kpoints, &
               t_3c_M, t_3c_O, t_3c_O_compressed, t_3c_O_ind, &
               mp2_env%ri_metric, &
               gd_B_occ_bse, gd_B_virt_bse)
         ELSE
            ! closed shell case (RI)
            CALL mp2_ri_gpw_compute_in(BIb_C, BIb_C_gw, BIb_C_bse_ij, BIb_C_bse_ab, gd_array, gd_B_virtual, &
                                       dimen_RI, dimen_RI_red, qs_env, para_env, para_env_sub, &
                                       color_sub, cell, particle_set, &
                                       atomic_kind_set, qs_kind_set, fm_matrix_PQ, &
                                       fm_matrix_L_kpoints, fm_matrix_Minv_L_kpoints, &
                                       fm_matrix_Minv, fm_matrix_Minv_Vtrunc_Minv, nmo, homo, &
                                       mat_munu, sab_orb_sub, &
                                       mo_coeff_o, mo_coeff_v, mo_coeff_all, mo_coeff_gw, mo_coeff_o_bse, mo_coeff_v_bse, &
                                       mp2_env%mp2_gpw%eps_filter, unit_nr, &
                                       mp2_env%mp2_memory, mp2_env%calc_PQ_cond_num, calc_forces, &
                                       blacs_env_sub, my_do_gw .AND. .NOT. do_im_time, do_bse, gd_B_all, &
                                       starts_array_mc, ends_array_mc, &
                                       starts_array_mc_block, ends_array_mc_block, &
                                       gw_corr_lev_occ(1), gw_corr_lev_virt(1), &
                                       bse_lev_virt, &
                                       do_im_time, do_kpoints_cubic_RPA, kpoints, &
                                       t_3c_M, t_3c_O, t_3c_O_compressed, t_3c_O_ind, &
                                       mp2_env%ri_metric, gd_B_occ_bse, gd_B_virt_bse)
         END IF

      ELSE
         ! Canonical MP2-GPW
         IF (nspins == 2) THEN
            ! alpha-alpha and alpha-beta components
            IF (unit_nr > 0) WRITE (unit_nr, *)
            IF (unit_nr > 0) WRITE (unit_nr, '(T3,A)') 'Alpha (ia|'
            CALL mp2_gpw_compute( &
               Emp2, Emp2_Cou, Emp2_EX, qs_env, para_env, para_env_sub, color_sub, &
               cell, particle_set, &
               atomic_kind_set, qs_kind_set, Eigenval, nmo, homo, mat_munu, &
               sab_orb_sub, mo_coeff_o, mo_coeff_v, mp2_env%mp2_gpw%eps_filter, unit_nr, &
               mp2_env%mp2_memory, calc_ex, blacs_env_sub, Emp2_AB)

            ! beta-beta component
            IF (unit_nr > 0) WRITE (unit_nr, *)
            IF (unit_nr > 0) WRITE (unit_nr, '(T3,A)') 'Beta (ia|'
            CALL mp2_gpw_compute( &
               Emp2_BB, Emp2_Cou_BB, Emp2_EX_BB, qs_env, para_env, para_env_sub, color_sub, cell, particle_set, &
               atomic_kind_set, qs_kind_set, Eigenval(:, 2:2), nmo, homo(2:2), mat_munu, &
               sab_orb_sub, mo_coeff_o(2:2), mo_coeff_v(2:2), mp2_env%mp2_gpw%eps_filter, unit_nr, &
               mp2_env%mp2_memory, calc_ex, blacs_env_sub)

            ! make order on the MP2 energy contributions
            Emp2_Cou = Emp2_Cou*0.25_dp
            Emp2_EX = Emp2_EX*0.5_dp

            Emp2_Cou_BB = Emp2_Cou_BB*0.25_dp
            Emp2_EX_BB = Emp2_EX_BB*0.5_dp

            Emp2_S = Emp2_AB
            Emp2_T = Emp2_Cou + Emp2_Cou_BB + Emp2_EX + Emp2_EX_BB

            Emp2_Cou = Emp2_Cou + Emp2_Cou_BB + Emp2_AB
            Emp2_EX = Emp2_EX + Emp2_EX_BB
            Emp2 = Emp2_EX + Emp2_Cou

         ELSE
            ! closed shell case
            CALL mp2_gpw_compute( &
               Emp2, Emp2_Cou, Emp2_EX, qs_env, para_env, para_env_sub, color_sub, cell, particle_set, &
               atomic_kind_set, qs_kind_set, Eigenval(:, 1:1), nmo, homo(1:1), mat_munu, &
               sab_orb_sub, mo_coeff_o(1:1), mo_coeff_v(1:1), mp2_env%mp2_gpw%eps_filter, unit_nr, &
               mp2_env%mp2_memory, calc_ex, blacs_env_sub)
         END IF
      END IF

      ! Free possibly large buffers allocated by dbcsr on the GPU,
      ! large hybrid dgemm/pdgemm's coming later will need the space.
      CALL dbcsr_clear_mempools()

      IF (calc_forces .AND. .NOT. do_im_time) THEN
         ! make a copy of mo_coeff_o and mo_coeff_v
         ALLOCATE (mp2_env%ri_grad%mo_coeff_o(nspins), mp2_env%ri_grad%mo_coeff_v(nspins))
         DO ispin = 1, nspins
            NULLIFY (mp2_env%ri_grad%mo_coeff_o(ispin)%matrix)
            CALL dbcsr_init_p(mp2_env%ri_grad%mo_coeff_o(ispin)%matrix)
            CALL dbcsr_copy(mp2_env%ri_grad%mo_coeff_o(ispin)%matrix, mo_coeff_o(ispin)%matrix, &
                            name="mo_coeff_o"//cp_to_string(ispin))
            NULLIFY (mp2_env%ri_grad%mo_coeff_v(ispin)%matrix)
            CALL dbcsr_init_p(mp2_env%ri_grad%mo_coeff_v(ispin)%matrix)
            CALL dbcsr_copy(mp2_env%ri_grad%mo_coeff_v(ispin)%matrix, mo_coeff_v(ispin)%matrix, &
                            name="mo_coeff_v"//cp_to_string(ispin))
         END DO
         CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
      END IF
      ! Copy mo coeffs for RPA exchange correction
      IF (mp2_env%ri_rpa%exchange_correction /= rpa_exchange_none) THEN
         ALLOCATE (mp2_env%ri_rpa%mo_coeff_o(nspins), mp2_env%ri_rpa%mo_coeff_v(nspins))
         DO ispin = 1, nspins
            CALL dbcsr_copy(mp2_env%ri_rpa%mo_coeff_o(ispin), mo_coeff_o(ispin)%matrix, name="mo_coeff_o")
            CALL dbcsr_copy(mp2_env%ri_rpa%mo_coeff_v(ispin), mo_coeff_v(ispin)%matrix, name="mo_coeff_v")
         END DO
      END IF

      IF (.NOT. do_im_time) THEN

         DO ispin = 1, nspins
            CALL dbcsr_release(mo_coeff_o(ispin)%matrix)
            DEALLOCATE (mo_coeff_o(ispin)%matrix)
            CALL dbcsr_release(mo_coeff_v(ispin)%matrix)
            DEALLOCATE (mo_coeff_v(ispin)%matrix)
            IF (my_do_gw) THEN
               CALL dbcsr_release(mo_coeff_all(ispin)%matrix)
               DEALLOCATE (mo_coeff_all(ispin)%matrix)
            END IF
         END DO
         DEALLOCATE (mo_coeff_o, mo_coeff_v)
         IF (my_do_gw) DEALLOCATE (mo_coeff_all)

      END IF
      IF (do_bse) THEN
         CALL dbcsr_release(mo_coeff_o_bse(1)%matrix)
         CALL dbcsr_release(mo_coeff_v_bse(1)%matrix)
         DEALLOCATE (mo_coeff_o_bse(1)%matrix)
         DEALLOCATE (mo_coeff_v_bse(1)%matrix)
      END IF
      DEALLOCATE (mo_coeff_o_bse, mo_coeff_v_bse)

      ! Release some memory for RPA exchange correction
      IF (calc_forces .AND. do_im_time .OR. &
          (.NOT. calc_forces .AND. mp2_env%ri_rpa%exchange_correction == rpa_exchange_none)) THEN

         CALL dbcsr_release(mat_munu%matrix)
         DEALLOCATE (mat_munu%matrix)

         CALL release_neighbor_list_sets(sab_orb_sub)

      END IF

      ! decide if to do RI-RPA or RI-MP2
      IF (my_do_ri_rpa .OR. my_do_ri_sos_laplace_mp2) THEN

         IF (do_im_time) CALL create_matrix_P(mat_P_global, qs_env, mp2_env, para_env)

         IF (.NOT. ALLOCATED(BIb_C)) ALLOCATE (BIb_C(nspins))
         IF (.NOT. ALLOCATED(BIb_C_gw)) ALLOCATE (BIb_C_gw(nspins))
         IF (.NOT. ALLOCATED(gd_B_virtual)) ALLOCATE (gd_B_virtual(nspins))

         ! RI-RPA
         CALL rpa_ri_compute_en(qs_env, Emp2, mp2_env, BIb_C, BIb_C_gw, BIb_C_bse_ij, BIb_C_bse_ab, &
                                para_env, para_env_sub, color_sub, &
                                gd_array, gd_B_virtual, gd_B_all, gd_B_occ_bse, gd_B_virt_bse, &
                                mo_coeff, fm_matrix_PQ, fm_matrix_L_kpoints, fm_matrix_Minv_L_kpoints, &
                                fm_matrix_Minv, fm_matrix_Minv_Vtrunc_Minv, kpoints, &
                                Eigenval, nmo, homo, dimen_RI, dimen_RI_red, gw_corr_lev_occ, gw_corr_lev_virt, &
                                bse_lev_virt, &
                                unit_nr, my_do_ri_sos_laplace_mp2, my_do_gw, do_im_time, do_bse, matrix_s, &
                                mat_munu, mat_P_global, t_3c_M, t_3c_O, t_3c_O_compressed, t_3c_O_ind, &
                                starts_array_mc, ends_array_mc, &
                                starts_array_mc_block, ends_array_mc_block, calc_forces)

         IF (mp2_env%ri_rpa%do_rse) &
            CALL rse_energy(qs_env, mp2_env, para_env, dft_control, mo_coeff, homo, Eigenval)

         IF (do_im_time) THEN
            IF (ASSOCIATED(mat_P_global%matrix)) THEN
               CALL dbcsr_release(mat_P_global%matrix)
               DEALLOCATE (mat_P_global%matrix)
            END IF

            CALL cp_libint_static_cleanup()
            IF (calc_forces) CALL cp_fm_release(fm_matrix_PQ)
         END IF

         ! Release some memory for RPA exchange correction
         IF (mp2_env%ri_rpa%exchange_correction /= rpa_exchange_none) THEN

            CALL dbcsr_release(mat_munu%matrix)
            DEALLOCATE (mat_munu%matrix)

            CALL release_neighbor_list_sets(sab_orb_sub)

         END IF

      ELSE
         IF (my_do_ri_mp2) THEN
            Emp2 = 0.0_dp
            Emp2_Cou = 0.0_dp
            Emp2_EX = 0.0_dp

            ! RI-MP2-GPW compute energy
            CALL mp2_ri_gpw_compute_en( &
               Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T, BIb_C, mp2_env, para_env, para_env_sub, color_sub, &
               gd_array, gd_B_virtual, &
               Eigenval, nmo, homo, dimen_RI_red, unit_nr, calc_forces, calc_ex)

         END IF
      END IF

      ! if we need forces time to calculate the MP2 non-separable contribution
      ! and start computing the Lagrangian
      IF (calc_forces .AND. .NOT. do_im_time) THEN

         CALL calc_ri_mp2_nonsep(qs_env, mp2_env, para_env, para_env_sub, cell, &
                                 particle_set, atomic_kind_set, qs_kind_set, &
                                 mo_coeff, dimen_RI, Eigenval, &
                                 my_group_L_start, my_group_L_end, my_group_L_size, &
                                 sab_orb_sub, mat_munu, blacs_env_sub)

         DO ispin = 1, nspins
            CALL dbcsr_release(mp2_env%ri_grad%mo_coeff_o(ispin)%matrix)
            DEALLOCATE (mp2_env%ri_grad%mo_coeff_o(ispin)%matrix)

            CALL dbcsr_release(mp2_env%ri_grad%mo_coeff_v(ispin)%matrix)
            DEALLOCATE (mp2_env%ri_grad%mo_coeff_v(ispin)%matrix)
         END DO
         DEALLOCATE (mp2_env%ri_grad%mo_coeff_o, mp2_env%ri_grad%mo_coeff_v)

         CALL dbcsr_release(mat_munu%matrix)
         DEALLOCATE (mat_munu%matrix)

         CALL release_neighbor_list_sets(sab_orb_sub)

      END IF

      !XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx
      ! moved from above
      IF (my_do_gw .AND. .NOT. do_im_time) THEN
         DO ispin = 1, nspins
            CALL dbcsr_release(mo_coeff_gw(ispin)%matrix)
            DEALLOCATE (mo_coeff_gw(ispin)%matrix)
         END DO
         DEALLOCATE (mo_coeff_gw)
      END IF

      ! re-init the radii to be able to generate pair lists with MP2-appropriate screening
      dft_control%qs_control%eps_pgf_orb = eps_pgf_orb_old
      dft_control%qs_control%eps_rho_rspace = eps_rho_rspace_old
      dft_control%qs_control%eps_gvg_rspace = eps_gvg_rspace_old
      CALL init_interaction_radii(dft_control%qs_control, qs_kind_set)

      CALL cp_blacs_env_release(blacs_env_sub)

      IF (.NOT. (mp2_env%ri_g0w0%print_local_bandgap .OR. mp2_env%bse%do_nto_analysis)) THEN
         CALL cp_rm_default_logger()
         CALL cp_logger_release(logger_sub)
      END IF

      CALL mp_para_env_release(para_env_sub)

      ! finally solve the z-vector equation if forces are required
      IF (calc_forces .AND. .NOT. do_im_time) THEN
         CALL solve_z_vector_eq(qs_env, mp2_env, para_env, dft_control, &
                                mo_coeff, homo, Eigenval, unit_nr)
      END IF

      DEALLOCATE (Eigenval, mo_coeff)

      CALL timestop(handle)

   END SUBROUTINE mp2_gpw_main

! **************************************************************************************************
!> \brief ...
!> \param para_env ...
!> \param para_env_sub ...
!> \param mo_coeff ...
!> \param homo ...
!> \param mat_munu ...
!> \param mo_coeff_o ...
!> \param mo_coeff_v ...
!> \param mo_coeff_all ...
!> \param mo_coeff_gw ...
!> \param my_do_gw ...
!> \param gw_corr_lev_occ ...
!> \param gw_corr_lev_virt ...
!> \param my_do_bse ...
!> \param bse_lev_virt ...
!> \param mo_coeff_o_bse ...
!> \param mo_coeff_v_bse ...
!> \param eps_filter ...
! **************************************************************************************************
   SUBROUTINE replicate_mat_to_subgroup(para_env, para_env_sub, mo_coeff, homo, mat_munu, &
                                        mo_coeff_o, mo_coeff_v, mo_coeff_all, mo_coeff_gw, my_do_gw, &
                                        gw_corr_lev_occ, gw_corr_lev_virt, my_do_bse, &
                                        bse_lev_virt, mo_coeff_o_bse, mo_coeff_v_bse, eps_filter)
      TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
      TYPE(cp_fm_type), INTENT(IN)                       :: mo_coeff
      INTEGER, INTENT(IN)                                :: homo
      TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_munu
      TYPE(dbcsr_type), POINTER                          :: mo_coeff_o, mo_coeff_v, mo_coeff_all, &
                                                            mo_coeff_gw
      LOGICAL, INTENT(IN)                                :: my_do_gw
      INTEGER, INTENT(IN)                                :: gw_corr_lev_occ, gw_corr_lev_virt
      LOGICAL, INTENT(IN)                                :: my_do_bse
      INTEGER, INTENT(IN)                                :: bse_lev_virt
      TYPE(dbcsr_type), POINTER                          :: mo_coeff_o_bse, mo_coeff_v_bse
      REAL(KIND=dp), INTENT(IN)                          :: eps_filter

      CHARACTER(LEN=*), PARAMETER :: routineN = 'replicate_mat_to_subgroup'

      INTEGER                                            :: handle
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: C
      TYPE(group_dist_d1_type)                           :: gd_array

      CALL timeset(routineN, handle)

      CALL grep_rows_in_subgroups(para_env, para_env_sub, mo_coeff, gd_array, C)

      ! create and fill mo_coeff_o, mo_coeff_v and mo_coeff_all
      ALLOCATE (mo_coeff_o)
      CALL build_dbcsr_from_rows(para_env_sub, mo_coeff_o, C(:, 1:homo), &
                                 mat_munu, gd_array, eps_filter)

      ALLOCATE (mo_coeff_v)
      CALL build_dbcsr_from_rows(para_env_sub, mo_coeff_v, C(:, homo + 1:), &
                                 mat_munu, gd_array, eps_filter)

      IF (my_do_gw) THEN
         ALLOCATE (mo_coeff_gw)
         CALL build_dbcsr_from_rows(para_env_sub, mo_coeff_gw, C(:, homo - gw_corr_lev_occ + 1:homo + gw_corr_lev_virt), &
                                    mat_munu, gd_array, eps_filter)

         ! all levels
         ALLOCATE (mo_coeff_all)
         CALL build_dbcsr_from_rows(para_env_sub, mo_coeff_all, C, &
                                    mat_munu, gd_array, eps_filter)

      END IF

      IF (my_do_bse) THEN

         ALLOCATE (mo_coeff_o_bse)
         CALL build_dbcsr_from_rows(para_env_sub, mo_coeff_o_bse, C(:, 1:homo), &
                                    mat_munu, gd_array, eps_filter)

         ALLOCATE (mo_coeff_v_bse)
         CALL build_dbcsr_from_rows(para_env_sub, mo_coeff_v_bse, C(:, homo + 1:homo + bse_lev_virt), &
                                    mat_munu, gd_array, eps_filter)

      END IF
      DEALLOCATE (C)
      CALL release_group_dist(gd_array)

      CALL timestop(handle)

   END SUBROUTINE replicate_mat_to_subgroup

! **************************************************************************************************
!> \brief ...
!> \param para_env ...
!> \param para_env_sub ...
!> \param mo_coeff ...
!> \param gd_array ...
!> \param C ...
! **************************************************************************************************
   SUBROUTINE grep_rows_in_subgroups(para_env, para_env_sub, mo_coeff, gd_array, C)
      TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
      TYPE(cp_fm_type), INTENT(IN)                       :: mo_coeff
      TYPE(group_dist_d1_type), INTENT(OUT)              :: gd_array
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
         INTENT(OUT)                                     :: C

      CHARACTER(LEN=*), PARAMETER :: routineN = 'grep_rows_in_subgroups'

      INTEGER :: handle, i_global, iiB, j_global, jjB, max_row_col_local, my_mu_end, my_mu_size, &
         my_mu_start, ncol_global, ncol_local, ncol_rec, nrow_global, nrow_local, nrow_rec, &
         proc_receive_static, proc_send_static, proc_shift
      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: local_col_row_info, rec_col_row_info
      INTEGER, DIMENSION(:), POINTER                     :: col_indices, col_indices_rec, &
                                                            row_indices, row_indices_rec
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: local_C, rec_C
      REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
         POINTER                                         :: local_C_internal

      CALL timeset(routineN, handle)

      CALL cp_fm_get_info(matrix=mo_coeff, &
                          ncol_global=ncol_global, &
                          nrow_global=nrow_global, &
                          nrow_local=nrow_local, &
                          ncol_local=ncol_local, &
                          row_indices=row_indices, &
                          col_indices=col_indices, &
                          local_data=local_C_internal)

      CALL create_group_dist(gd_array, para_env_sub%num_pe, nrow_global)
      CALL get_group_dist(gd_array, para_env_sub%mepos, my_mu_start, my_mu_end, my_mu_size)

      ! local storage for the C matrix
      ALLOCATE (C(my_mu_size, ncol_global))
      C = 0.0_dp

      ALLOCATE (local_C(nrow_local, ncol_local))
      local_C(:, :) = local_C_internal(1:nrow_local, 1:ncol_local)
      NULLIFY (local_C_internal)

      max_row_col_local = MAX(nrow_local, ncol_local)
      CALL para_env%max(max_row_col_local)

      ALLOCATE (local_col_row_info(0:max_row_col_local, 2))
      local_col_row_info = 0
      ! 0,1 nrows
      local_col_row_info(0, 1) = nrow_local
      local_col_row_info(1:nrow_local, 1) = row_indices(1:nrow_local)
      ! 0,2 ncols
      local_col_row_info(0, 2) = ncol_local
      local_col_row_info(1:ncol_local, 2) = col_indices(1:ncol_local)

      ALLOCATE (rec_col_row_info(0:max_row_col_local, 2))

      ! accumulate data on C buffer starting from myself
      DO iiB = 1, nrow_local
         i_global = row_indices(iiB)
         IF (i_global >= my_mu_start .AND. i_global <= my_mu_end) THEN
            DO jjB = 1, ncol_local
               j_global = col_indices(jjB)
               C(i_global - my_mu_start + 1, j_global) = local_C(iiB, jjB)
            END DO
         END IF
      END DO

      ! start ring communication for collecting the data from the other
      proc_send_static = MODULO(para_env%mepos + 1, para_env%num_pe)
      proc_receive_static = MODULO(para_env%mepos - 1, para_env%num_pe)
      DO proc_shift = 1, para_env%num_pe - 1
         ! first exchange information on the local data
         rec_col_row_info = 0
         CALL para_env%sendrecv(local_col_row_info, proc_send_static, rec_col_row_info, proc_receive_static)
         nrow_rec = rec_col_row_info(0, 1)
         ncol_rec = rec_col_row_info(0, 2)

         ALLOCATE (row_indices_rec(nrow_rec))
         row_indices_rec = rec_col_row_info(1:nrow_rec, 1)

         ALLOCATE (col_indices_rec(ncol_rec))
         col_indices_rec = rec_col_row_info(1:ncol_rec, 2)

         ALLOCATE (rec_C(nrow_rec, ncol_rec))
         rec_C = 0.0_dp

         ! then send and receive the real data
         CALL para_env%sendrecv(local_C, proc_send_static, rec_C, proc_receive_static)

         ! accumulate the received data on C buffer
         DO iiB = 1, nrow_rec
            i_global = row_indices_rec(iiB)
            IF (i_global >= my_mu_start .AND. i_global <= my_mu_end) THEN
               DO jjB = 1, ncol_rec
                  j_global = col_indices_rec(jjB)
                  C(i_global - my_mu_start + 1, j_global) = rec_C(iiB, jjB)
               END DO
            END IF
         END DO

         local_col_row_info(:, :) = rec_col_row_info
         DEALLOCATE (local_C)
         ALLOCATE (local_C(nrow_rec, ncol_rec))
         local_C(:, :) = rec_C

         DEALLOCATE (col_indices_rec)
         DEALLOCATE (row_indices_rec)
         DEALLOCATE (rec_C)
      END DO

      DEALLOCATE (local_C)
      DEALLOCATE (local_col_row_info)
      DEALLOCATE (rec_col_row_info)

      CALL timestop(handle)

   END SUBROUTINE grep_rows_in_subgroups

! **************************************************************************************************
!> \brief Encapsulate the building of dbcsr_matrices mo_coeff_(v,o,all)
!> \param para_env_sub ...
!> \param mo_coeff_to_build ...
!> \param Cread ...
!> \param mat_munu ...
!> \param gd_array ...
!> \param eps_filter ...
!> \author Jan Wilhelm, Code by Mauro Del Ben
! **************************************************************************************************
   SUBROUTINE build_dbcsr_from_rows(para_env_sub, mo_coeff_to_build, Cread, &
                                    mat_munu, gd_array, eps_filter)
      TYPE(mp_para_env_type), INTENT(IN)                 :: para_env_sub
      TYPE(dbcsr_type)                                   :: mo_coeff_to_build
      REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Cread
      TYPE(dbcsr_type), INTENT(INOUT)                    :: mat_munu
      TYPE(group_dist_d1_type), INTENT(IN)               :: gd_array
      REAL(KIND=dp), INTENT(IN)                          :: eps_filter

      CHARACTER(LEN=*), PARAMETER :: routineN = 'build_dbcsr_from_rows'

      INTEGER :: col, col_offset, col_size, handle, i, i_global, j, j_global, my_mu_end, &
         my_mu_start, ncol_global, proc_receive, proc_send, proc_shift, rec_mu_end, rec_mu_size, &
         rec_mu_start, row, row_offset, row_size
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: rec_C
      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: data_block
      TYPE(dbcsr_iterator_type)                          :: iter

      CALL timeset(routineN, handle)

      ncol_global = SIZE(Cread, 2)

      CALL get_group_dist(gd_array, para_env_sub%mepos, my_mu_start, my_mu_end)

      CALL cp_dbcsr_m_by_n_from_row_template(mo_coeff_to_build, template=mat_munu, n=ncol_global, &
                                             sym=dbcsr_type_no_symmetry)
      CALL dbcsr_reserve_all_blocks(mo_coeff_to_build)

      ! accumulate data on mo_coeff_to_build starting from myself
      CALL dbcsr_iterator_start(iter, mo_coeff_to_build)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_block, &
                                        row_size=row_size, col_size=col_size, &
                                        row_offset=row_offset, col_offset=col_offset)
         DO i = 1, row_size
            i_global = row_offset + i - 1
            IF (i_global >= my_mu_start .AND. i_global <= my_mu_end) THEN
               DO j = 1, col_size
                  j_global = col_offset + j - 1
                  data_block(i, j) = Cread(i_global - my_mu_start + 1, col_offset + j - 1)
               END DO
            END IF
         END DO
      END DO
      CALL dbcsr_iterator_stop(iter)

      ! start ring communication in the subgroup for collecting the data from the other
      ! proc (occupied)
      DO proc_shift = 1, para_env_sub%num_pe - 1
         proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
         proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

         CALL get_group_dist(gd_array, proc_receive, rec_mu_start, rec_mu_end, rec_mu_size)

         ALLOCATE (rec_C(rec_mu_size, ncol_global))
         rec_C = 0.0_dp

         ! then send and receive the real data
         CALL para_env_sub%sendrecv(Cread, proc_send, rec_C, proc_receive)

         ! accumulate data on mo_coeff_to_build the data received from proc_rec
         CALL dbcsr_iterator_start(iter, mo_coeff_to_build)
         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, data_block, &
                                           row_size=row_size, col_size=col_size, &
                                           row_offset=row_offset, col_offset=col_offset)
            DO i = 1, row_size
               i_global = row_offset + i - 1
               IF (i_global >= rec_mu_start .AND. i_global <= rec_mu_end) THEN
                  DO j = 1, col_size
                     j_global = col_offset + j - 1
                     data_block(i, j) = rec_C(i_global - rec_mu_start + 1, col_offset + j - 1)
                  END DO
               END IF
            END DO
         END DO
         CALL dbcsr_iterator_stop(iter)

         DEALLOCATE (rec_C)

      END DO
      CALL dbcsr_filter(mo_coeff_to_build, eps_filter)

      CALL timestop(handle)

   END SUBROUTINE build_dbcsr_from_rows

! **************************************************************************************************
!> \brief Encapsulate the building of dbcsr_matrix mat_munu
!> \param mat_munu ...
!> \param qs_env ...
!> \param eps_grid ...
!> \param blacs_env_sub ...
!> \param do_ri_aux_basis ...
!> \param do_mixed_basis ...
!> \param group_size_prim ...
!> \param do_alloc_blocks_from_nbl ...
!> \param do_kpoints ...
!> \param sab_orb_sub ...
!> \param dbcsr_sym_type ...
!> \author Jan Wilhelm, code by Mauro Del Ben
! **************************************************************************************************
   SUBROUTINE create_mat_munu(mat_munu, qs_env, eps_grid, blacs_env_sub, &
                              do_ri_aux_basis, do_mixed_basis, group_size_prim, &
                              do_alloc_blocks_from_nbl, do_kpoints, sab_orb_sub, dbcsr_sym_type)

      TYPE(dbcsr_p_type), INTENT(OUT)                    :: mat_munu
      TYPE(qs_environment_type), POINTER                 :: qs_env
      REAL(KIND=dp)                                      :: eps_grid
      TYPE(cp_blacs_env_type), POINTER                   :: blacs_env_sub
      LOGICAL, INTENT(IN), OPTIONAL                      :: do_ri_aux_basis, do_mixed_basis
      INTEGER, INTENT(IN), OPTIONAL                      :: group_size_prim
      LOGICAL, INTENT(IN), OPTIONAL                      :: do_alloc_blocks_from_nbl, do_kpoints
      TYPE(neighbor_list_set_p_type), DIMENSION(:), &
         OPTIONAL, POINTER                               :: sab_orb_sub
      CHARACTER, OPTIONAL                                :: dbcsr_sym_type

      CHARACTER(LEN=*), PARAMETER                        :: routineN = 'create_mat_munu'

      CHARACTER                                          :: my_dbcsr_sym_type
      INTEGER                                            :: handle, ikind, natom, nkind
      INTEGER, DIMENSION(:), POINTER                     :: col_blk_sizes, row_blk_sizes
      LOGICAL                                            :: my_do_alloc_blocks_from_nbl, &
                                                            my_do_kpoints, my_do_mixed_basis, &
                                                            my_do_ri_aux_basis
      LOGICAL, ALLOCATABLE, DIMENSION(:)                 :: orb_present
      REAL(dp), ALLOCATABLE, DIMENSION(:)                :: orb_radius
      REAL(dp), ALLOCATABLE, DIMENSION(:, :)             :: pair_radius
      REAL(KIND=dp)                                      :: subcells
      TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
      TYPE(cell_type), POINTER                           :: cell
      TYPE(dbcsr_distribution_type), POINTER             :: dbcsr_dist_sub
      TYPE(dft_control_type), POINTER                    :: dft_control
      TYPE(distribution_1d_type), POINTER                :: local_molecules_sub, local_particles_sub
      TYPE(distribution_2d_type), POINTER                :: distribution_2d_sub
      TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER  :: basis_set_ri_aux
      TYPE(gto_basis_set_type), POINTER                  :: orb_basis_set
      TYPE(local_atoms_type), ALLOCATABLE, DIMENSION(:)  :: atom2d
      TYPE(molecule_kind_type), DIMENSION(:), POINTER    :: molecule_kind_set
      TYPE(molecule_type), DIMENSION(:), POINTER         :: molecule_set
      TYPE(neighbor_list_set_p_type), DIMENSION(:), &
         POINTER                                         :: my_sab_orb_sub
      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
      TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set

      CALL timeset(routineN, handle)

      NULLIFY (basis_set_ri_aux)

      my_do_ri_aux_basis = .FALSE.
      IF (PRESENT(do_ri_aux_basis)) THEN
         my_do_ri_aux_basis = do_ri_aux_basis
      END IF

      my_do_mixed_basis = .FALSE.
      IF (PRESENT(do_mixed_basis)) THEN
         my_do_mixed_basis = do_mixed_basis
      END IF

      my_do_alloc_blocks_from_nbl = .FALSE.
      IF (PRESENT(do_alloc_blocks_from_nbl)) THEN
         my_do_alloc_blocks_from_nbl = do_alloc_blocks_from_nbl
      END IF

      my_do_kpoints = .FALSE.
      IF (PRESENT(do_kpoints)) THEN
         my_do_kpoints = do_kpoints
      END IF

      my_dbcsr_sym_type = dbcsr_type_no_symmetry
      IF (PRESENT(dbcsr_sym_type)) THEN
         my_dbcsr_sym_type = dbcsr_sym_type
      END IF

      CALL get_qs_env(qs_env, &
                      qs_kind_set=qs_kind_set, &
                      cell=cell, &
                      particle_set=particle_set, &
                      atomic_kind_set=atomic_kind_set, &
                      molecule_set=molecule_set, &
                      molecule_kind_set=molecule_kind_set, &
                      dft_control=dft_control)

      IF (my_do_kpoints) THEN
         ! please choose EPS_PGF_ORB in QS section smaller than EPS_GRID in WFC_GPW section
         IF (eps_grid < dft_control%qs_control%eps_pgf_orb) THEN
            eps_grid = dft_control%qs_control%eps_pgf_orb
            CPWARN("WFC_GPW%EPS_GRID has been set to QS%EPS_PGF_ORB")
         END IF
      END IF

      ! hack hack hack XXXXXXXXXXXXXXX ... to be fixed
      dft_control%qs_control%eps_pgf_orb = eps_grid
      dft_control%qs_control%eps_rho_rspace = eps_grid
      dft_control%qs_control%eps_gvg_rspace = eps_grid
      CALL init_interaction_radii(dft_control%qs_control, qs_kind_set)

      ! get a distribution_1d
      NULLIFY (local_particles_sub, local_molecules_sub)
      CALL distribute_molecules_1d(atomic_kind_set=atomic_kind_set, &
                                   particle_set=particle_set, &
                                   local_particles=local_particles_sub, &
                                   molecule_kind_set=molecule_kind_set, &
                                   molecule_set=molecule_set, &
                                   local_molecules=local_molecules_sub, &
                                   force_env_section=qs_env%input)

      ! get a distribution_2d
      NULLIFY (distribution_2d_sub)
      CALL distribute_molecules_2d(cell=cell, &
                                   atomic_kind_set=atomic_kind_set, &
                                   qs_kind_set=qs_kind_set, &
                                   particle_set=particle_set, &
                                   molecule_kind_set=molecule_kind_set, &
                                   molecule_set=molecule_set, &
                                   distribution_2d=distribution_2d_sub, &
                                   blacs_env=blacs_env_sub, &
                                   force_env_section=qs_env%input)

      ! Build the sub orbital-orbital overlap neighbor lists
      CALL section_vals_val_get(qs_env%input, "DFT%SUBCELLS", r_val=subcells)
      nkind = SIZE(atomic_kind_set)
      ALLOCATE (atom2d(nkind))

      CALL atom2d_build(atom2d, local_particles_sub, distribution_2d_sub, atomic_kind_set, &
                        molecule_set, molecule_only=.FALSE., particle_set=particle_set)

      ALLOCATE (orb_present(nkind))
      ALLOCATE (orb_radius(nkind))
      ALLOCATE (pair_radius(nkind, nkind))

      DO ikind = 1, nkind
         CALL get_qs_kind(qs_kind_set(ikind), basis_set=orb_basis_set)
         IF (ASSOCIATED(orb_basis_set)) THEN
            orb_present(ikind) = .TRUE.
            CALL get_gto_basis_set(gto_basis_set=orb_basis_set, kind_radius=orb_radius(ikind))
         ELSE
            orb_present(ikind) = .FALSE.
            orb_radius(ikind) = 0.0_dp
         END IF
      END DO

      CALL pair_radius_setup(orb_present, orb_present, orb_radius, orb_radius, pair_radius)

      IF (PRESENT(sab_orb_sub)) THEN
         NULLIFY (sab_orb_sub)
         ! for cubic RPA/GW with kpoints, we need all neighbors and not only the symmetric ones
         IF (my_do_kpoints) THEN
            CALL build_neighbor_lists(sab_orb_sub, particle_set, atom2d, cell, pair_radius, &
                                      mic=.FALSE., subcells=subcells, molecular=.FALSE., nlname="sab_orb_sub", &
                                      symmetric=.FALSE.)
         ELSE
            CALL build_neighbor_lists(sab_orb_sub, particle_set, atom2d, cell, pair_radius, &
                                      mic=.FALSE., subcells=subcells, molecular=.FALSE., nlname="sab_orb_sub")
         END IF
      ELSE
         NULLIFY (my_sab_orb_sub)
         ! for cubic RPA/GW with kpoints, we need all neighbors and not only the symmetric ones
         IF (my_do_kpoints) THEN
            CALL build_neighbor_lists(my_sab_orb_sub, particle_set, atom2d, cell, pair_radius, &
                                      mic=.FALSE., subcells=subcells, molecular=.FALSE., nlname="sab_orb_sub", &
                                      symmetric=.FALSE.)
         ELSE
            CALL build_neighbor_lists(my_sab_orb_sub, particle_set, atom2d, cell, pair_radius, &
                                      mic=.FALSE., subcells=subcells, molecular=.FALSE., nlname="sab_orb_sub")
         END IF
      END IF
      CALL atom2d_cleanup(atom2d)
      DEALLOCATE (atom2d)
      DEALLOCATE (orb_present, orb_radius, pair_radius)

      ! a dbcsr_dist
      ALLOCATE (dbcsr_dist_sub)
      CALL cp_dbcsr_dist2d_to_dist(distribution_2d_sub, dbcsr_dist_sub)

      ! build a dbcsr matrix the hard way
      natom = SIZE(particle_set)
      ALLOCATE (row_blk_sizes(natom))
      IF (my_do_ri_aux_basis) THEN

         ALLOCATE (basis_set_ri_aux(nkind))
         CALL basis_set_list_setup(basis_set_ri_aux, "RI_AUX", qs_kind_set)
         CALL get_particle_set(particle_set, qs_kind_set, nsgf=row_blk_sizes, basis=basis_set_ri_aux)
         DEALLOCATE (basis_set_ri_aux)

      ELSE IF (my_do_mixed_basis) THEN

         ALLOCATE (basis_set_ri_aux(nkind))
         CALL basis_set_list_setup(basis_set_ri_aux, "RI_AUX", qs_kind_set)
         CALL get_particle_set(particle_set, qs_kind_set, nsgf=row_blk_sizes, basis=basis_set_ri_aux)
         DEALLOCATE (basis_set_ri_aux)

         ALLOCATE (col_blk_sizes(natom))

         CALL get_particle_set(particle_set, qs_kind_set, nsgf=col_blk_sizes)
         col_blk_sizes = col_blk_sizes*group_size_prim

      ELSE
         CALL get_particle_set(particle_set, qs_kind_set, nsgf=row_blk_sizes)
      END IF

      NULLIFY (mat_munu%matrix)
      ALLOCATE (mat_munu%matrix)

      IF (my_do_ri_aux_basis) THEN

         CALL dbcsr_create(matrix=mat_munu%matrix, &
                           name="(ai|munu)", &
                           dist=dbcsr_dist_sub, matrix_type=my_dbcsr_sym_type, &
                           row_blk_size=row_blk_sizes, col_blk_size=row_blk_sizes)

      ELSE IF (my_do_mixed_basis) THEN

         CALL dbcsr_create(matrix=mat_munu%matrix, &
                           name="(ai|munu)", &
                           dist=dbcsr_dist_sub, matrix_type=my_dbcsr_sym_type, &
                           row_blk_size=row_blk_sizes, col_blk_size=col_blk_sizes)

      ELSE

         CALL dbcsr_create(matrix=mat_munu%matrix, &
                           name="(ai|munu)", &
                           dist=dbcsr_dist_sub, matrix_type=my_dbcsr_sym_type, &
                           row_blk_size=row_blk_sizes, col_blk_size=row_blk_sizes)

         IF (my_do_alloc_blocks_from_nbl) THEN

            IF (PRESENT(sab_orb_sub)) THEN
               CALL cp_dbcsr_alloc_block_from_nbl(mat_munu%matrix, sab_orb_sub)
            ELSE
               CALL cp_dbcsr_alloc_block_from_nbl(mat_munu%matrix, my_sab_orb_sub)
            END IF

         END IF

      END IF

      DEALLOCATE (row_blk_sizes)

      IF (my_do_mixed_basis) THEN
         DEALLOCATE (col_blk_sizes)
      END IF

      CALL dbcsr_distribution_release(dbcsr_dist_sub)
      DEALLOCATE (dbcsr_dist_sub)

      CALL distribution_2d_release(distribution_2d_sub)

      CALL distribution_1d_release(local_particles_sub)
      CALL distribution_1d_release(local_molecules_sub)

      IF (.NOT. PRESENT(sab_orb_sub)) THEN
         CALL release_neighbor_list_sets(my_sab_orb_sub)
      END IF

      CALL timestop(handle)

   END SUBROUTINE create_mat_munu

! **************************************************************************************************
!> \brief ...
!> \param mat_P_global ...
!> \param qs_env ...
!> \param mp2_env ...
!> \param para_env ...
! **************************************************************************************************
   SUBROUTINE create_matrix_P(mat_P_global, qs_env, mp2_env, para_env)

      TYPE(dbcsr_p_type), INTENT(OUT)                    :: mat_P_global
      TYPE(qs_environment_type), POINTER                 :: qs_env
      TYPE(mp2_type)                                     :: mp2_env
      TYPE(mp_para_env_type), POINTER                    :: para_env

      CHARACTER(LEN=*), PARAMETER                        :: routineN = 'create_matrix_P'

      INTEGER                                            :: blacs_grid_layout, handle
      LOGICAL                                            :: blacs_repeatable
      TYPE(cp_blacs_env_type), POINTER                   :: blacs_env_global

      CALL timeset(routineN, handle)

      blacs_grid_layout = BLACS_GRID_SQUARE
      blacs_repeatable = .TRUE.
      NULLIFY (blacs_env_global)
      CALL cp_blacs_env_create(blacs_env_global, para_env, &
                               blacs_grid_layout, &
                               blacs_repeatable)

      CALL create_mat_munu(mat_P_global, qs_env, mp2_env%mp2_gpw%eps_grid, &
                           blacs_env_global, do_ri_aux_basis=.TRUE., &
                           do_kpoints=mp2_env%ri_rpa_im_time%do_im_time_kpoints)

      CALL dbcsr_reserve_all_blocks(mat_P_global%matrix)
      CALL cp_blacs_env_release(blacs_env_global)

      CALL timestop(handle)

   END SUBROUTINE create_matrix_P

! **************************************************************************************************
!> \brief ...
!> \param dft_control ...
!> \param eps_pgf_orb_old ...
!> \param eps_rho_rspace_old ...
!> \param eps_gvg_rspace_old ...
! **************************************************************************************************
   PURE SUBROUTINE get_eps_old(dft_control, eps_pgf_orb_old, eps_rho_rspace_old, eps_gvg_rspace_old)

      TYPE(dft_control_type), INTENT(INOUT)              :: dft_control
      REAL(kind=dp), INTENT(OUT)                         :: eps_pgf_orb_old, eps_rho_rspace_old, &
                                                            eps_gvg_rspace_old

      ! re-init the radii to be able to generate pair lists with MP2-appropriate screening
      eps_pgf_orb_old = dft_control%qs_control%eps_pgf_orb
      eps_rho_rspace_old = dft_control%qs_control%eps_rho_rspace
      eps_gvg_rspace_old = dft_control%qs_control%eps_gvg_rspace

   END SUBROUTINE get_eps_old

END MODULE mp2_gpw
